This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
Loading package and dependencies:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(tidyr)
library(forcats)
library(stringr)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
df1 <- read.csv('./kagglesurvey/kaggle_survey_2022_responses.csv')
head(df1$Q5)
## [1] "Are you currently a student? (high school, university, or graduate)"
## [2] "No"
## [3] "No"
## [4] "Yes"
## [5] "No"
## [6] "Yes"
df1 <- df1[-1,]
head(df1$Q5)
## [1] "No" "No" "Yes" "No" "Yes" "Yes"
## removing the first row
table(df1$Q5)
##
## No Yes
## 12036 11961
prop.table(table(df1$Q5))
##
## No Yes
## 0.5015627 0.4984373
#proportion of non-students to students
table(df1$Q3)
##
## Man Nonbinary Prefer not to say
## 18266 78 334
## Prefer to self-describe Woman
## 33 5286
as.data.frame(prop.table(table(df1$Q3)))
#ratio of gender
table(df1$Q7)
## < table of extent 0 >
# platform poularity
table(df1$Q12_1)
##
## Python
## 5344 18653
data.frame(prop.table(table(df1$Q12_1)))
df_index <- read.csv('./kagglesurvey/kaggle_survey_2022_responses.csv', header=0)
df_index[1:2,]
df_index <- transpose(df_index[1:2,])
helpful_platforms <- df1 %>% select(contains("Q7_"))
data.frame(x=unlist(helpful_platforms))
helpful_platforms <- helpful_platforms %>% mutate_all(na_if, "")
helpful_platforms <- data.frame(data.frame(x=unlist(helpful_platforms)))
data.frame(sort(table(helpful_platforms), decreasing = TRUE))
data.frame(sort(table(df1[,173]), decreasing=TRUE))
gender_pay <- df1 %>% select(c("Q3","Q29"))
#24,000 rows with NULLS
gender_pay <- gender_pay %>% mutate_all(na_if, "")
#clean NAs
gender_pay %>% group_by(Q3) %>% count(Q29)
#data.frame(table(gender_pay$Q29)) %>% arrange(Freq)
gender_pay %>% count(Q3)
gender_paygap <- gender_pay %>% filter(Q3 == "Man" | Q3 == "Woman")
gender_paygap %>% group_by(Q3) %>% count(Q29)
We are almost there, but the dollar signs are messing with the hierarchies…
gender_paygap %>% count(Q29)
gender_paygap %>% mutate(across(starts_with("Q29"), ~gsub("\\$", "", .))) %>% count(Q29) %>% arrange(Q29)
gender_paygap %>% mutate(across(starts_with("Q29"), ~gsub("\\$", "", .))) %>% mutate(across(starts_with("Q29"), ~gsub("\\,", "", .))) %>% count(Q29) %>% arrange(Q29)
gender_paygap <- gender_paygap %>% mutate(across(starts_with("Q29"), ~gsub("\\$", "", .))) %>% mutate(across(starts_with("Q29"), ~gsub("\\,", "", .)))
# stripping out "$" and "commas"
gender_paygap_range <- gender_paygap %>% separate(Q29, c("MINsal", "MAXsal"))
#string split by "-"
millionaires <- gender_paygap %>% filter(Q29 == ">1000000")
#as.numeric(gender_paygap_range$MINsal)
as.numeric(millionaires$Q29)
## Warning: NAs introduced by coercion
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
#million+ earners would get NA-coerced by data type transformation
gender_paygap <- gender_paygap %>% filter(Q29 != ">1000000")
#remove millionaires
gender_paygap <- gender_paygap %>% separate(Q29, c("MINsal", "MAXsal"))
#string split
gender_paygap$MINsal <- as.numeric(gender_paygap$MINsal)
gender_paygap$MAXsal <- as.numeric(gender_paygap$MAXsal)
#convert to numeric data
salary_data <- data.frame(gender_paygap)
salary_data
gender_paygap %>%
mutate(
salary_range = paste0 (
format(MINsal, trim = TRUE),
"-",
MAXsal
),
salary_range = fct_reorder(salary_range, MINsal)
)
gender_paygap <- gender_paygap %>%
mutate(
salary_range = paste0 (
format(MINsal, trim = TRUE),
"-",
MAXsal
),
salary_range = fct_reorder(salary_range, MINsal)
)
#drop the NAs, then rename gender column
gender_paygap <- gender_paygap %>% drop_na()
gender_paygap <- gender_paygap %>% rename("gender" = "Q3")
gender_paygap %>% arrange(desc(salary_range))
aggr_gpg <- gender_paygap %>% group_by(gender) %>% count(salary_range)
ggplot(aggr_gpg, aes(x=salary_range, y=n, fill=gender)) +
geom_bar(data=subset(aggr_gpg, gender == "Man"), stat="identity") +
geom_bar(data=subset(aggr_gpg, gender == "Woman"), stat="identity", aes(y=-n)) +
coord_flip() + scale_fill_manual(values=c("lightblue", "pink"))
gender_paygap %>% group_by(gender) %>% summarise(count = n()/nrow(.))
#gender ratio of gender_paygap df
gender_demo <- gender_paygap %>% group_by(gender) %>% summarise(percentage = round(n()/nrow(.),4)*100, lab.pos = cumsum(percentage)-.5*percentage)
ggplot(gender_demo, aes(x=1, y=percentage, fill=gender)) +
geom_bar(stat="identity") +
coord_polar("y", start = 0) +
geom_text(aes(y = lab.pos, label = paste(percentage,"%", sep = "")), col = "white") + theme_void() +
scale_fill_manual(values=c("lightblue", "lightpink")) +
xlim(-1, 2.5)
#ggsave("gender_demo_white.png")
aggr_gpg2 <- gender_paygap %>% group_by(salary_range, gender) %>% summarise(n = n()) %>% mutate(freq = n /sum(n))
## `summarise()` has grouped output by 'salary_range'. You can override using the
## `.groups` argument.
ggplot(aggr_gpg2, aes(x=salary_range, y=freq, fill=gender)) +
geom_bar(data=subset(aggr_gpg2, gender == "Man"), stat="identity") +
geom_bar(data=subset(aggr_gpg2, gender == "Woman"), stat="identity", aes(y=-freq)) +
geom_hline(yintercept = 0, linetype="dotted", alpha=0.6) +
# Accuracy of y-axis
scale_y_continuous( labels=c("30%","0%","30%","60%","90%")) +
coord_flip() + scale_fill_manual(values=c("lightblue", "pink")) + theme_minimal() +
labs(x = "salary range (USD)", y = "percent share")
#ggsave("gendersalaryratio.png")
gender_paygap %>% group_by(salary_range, gender) %>% summarise(n = n()) %>% mutate(freq = n /sum(n))
## `summarise()` has grouped output by 'salary_range'. You can override using the
## `.groups` argument.
gender_paygap %>% group_by(gender) %>% summarise(n = n()) %>% mutate(freq = n /sum(n))
#gender ratio and relative group percentage
df1 %>% select(Q23) %>% count(Q23)
gender_job <- df1 %>% select(c("Q3", "Q23"))
gender_job <- gender_job %>% mutate_all(na_if, "")
gender_job <- gender_job %>% filter(Q3 == "Man" | Q3 == "Woman")
gender_job <- gender_job %>% filter(Q23 != "Currently not employed" & Q23 != "Other")
gender_job <- gender_job %>% drop_na()
#house cleaning code chunk
gender_job <- gender_job %>% mutate(Q23 = replace(Q23, Q23 == "Data Analyst (Business, Marketing, Financial, Quantitative, etc)", "Data Analyst")) %>% mutate(Q23 = replace(Q23, Q23 == "Manager (Program, Project, Operations, Executive-level, etc)", "Manager"))
#simplifying strings of some job roles
gender_job <- gender_job %>% rename("gender" = "Q3", "job_title" = "Q23")
gender_job %>% select(job_title) %>% count(job_title) %>% arrange(desc(n))
gender_job %>% group_by(job_title) %>% summarise(n = n()) %>% mutate(freq = n /sum(n)) %>% arrange(desc(n))
gender_job_ratio <- gender_job %>% group_by(job_title, gender) %>% summarise(n = n()) %>% mutate(freq = n /sum(n))
## `summarise()` has grouped output by 'job_title'. You can override using the
## `.groups` argument.
gender_job %>% group_by(job_title, gender) %>% summarise(n = n()) %>% mutate(freq = n /sum(n))
## `summarise()` has grouped output by 'job_title'. You can override using the
## `.groups` argument.
ggplot(gender_job_ratio, aes(x=job_title, y=freq, fill=gender)) +
geom_bar(data=subset(gender_job_ratio, gender == "Man"), stat="identity") +
geom_bar(data=subset(gender_job_ratio, gender == "Woman"), stat="identity", aes(y=-freq)) +
geom_hline(yintercept = 0, linetype="dotted", alpha=0.6) +
# Accuracy of y-axis
scale_y_continuous(breaks=c(-0.25,0,0.25,0.5,0.75),labels=c("25%", "0%", "25%","50%", "75%")) +
coord_flip() + scale_fill_manual(values=c("lightblue", "pink")) + theme_minimal() +
labs(x="job title", y="percent share")
#ggsave("genderjobratio.png")
cc_usage <- df1 %>% select(contains(c("Q31")))
cc_enjoyability <- df1 %>% select(Q32)
cc_enjoyability %>% count(Q32) %>% arrange(desc(n))
cc_usage %>% gather("key", "value") %>% group_by(value) %>% summarise(n=n()) %>% arrange(desc(n))
cc_spending <- df1 %>% select(c(Q30, Q23))
df1 %>% select(contains(c("Q44"))) %>% gather("key", "value") %>% group_by(value) %>% summarise(n=n()) %>% arrange(desc(n))
### distribution of salary range
salary_range_dist <- gender_paygap %>% select(salary_range) %>% count(salary_range)
ggplot(salary_range_dist, aes(x=salary_range, y=n)) +
geom_col(fill="skyblue") +
coord_flip() + theme_minimal()
plot above is skewed, need to consolidate to USA-only
next, tidying up data
clean_sal <- df1 %>% select(c("Q4","Q29")) %>% mutate(across(starts_with("Q29"), ~gsub("\\$", "", .))) %>% mutate(across(starts_with("Q29"), ~gsub("\\,", "", .)))
#strip out $ and commas
clean_sal <- clean_sal %>% filter(Q29 != ">1000000")
#remove millionaires
clean_sal<- clean_sal %>% separate(Q29, c("MINsal", "MAXsal"))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 15861 rows [1, 2,
## 3, 5, 6, 7, 10, 11, 12, 13, 15, 16, 22, 23, 24, 25, 27, 29, 31, 32, ...].
#string split by "-"
clean_sal
bind clean salaries
global_sal_range <- clean_sal
global_sal_range
usa_sal_range <- global_sal_range %>% filter(Q4 == "United States of America")
usa_sal_range <- usa_sal_range %>% drop_na()
#drop na, then convert columns to numeric
usa_sal_range$MINsal <- as.numeric(usa_sal_range$MINsal)
usa_sal_range$MAXsal <- as.numeric(usa_sal_range$MAXsal)
#after, create factor column corresponding to original salary range columns
usa_sal_range <- usa_sal_range %>%
mutate(
salary_range = paste0 (
format(MINsal, trim = TRUE),
"-",
MAXsal
),
salary_range = fct_reorder(salary_range, MINsal)
)
after all that, now we can attempt to visualize
usa_sal_range
usa_salary <- usa_sal_range %>% select(salary_range) %>% count(salary_range)
ggplot(usa_salary, aes(x=salary_range, y=n)) +
geom_col(fill="skyblue") +
coord_flip() + theme_minimal() + labs(x="salary range, annual (USD)", y="count")
usa_salary
#filter for non-USA countries, then drop_na
not_usa_salary <- global_sal_range %>% filter(Q4 != "United States of America")
not_usa_salary <- not_usa_salary %>% drop_na()
#convert to numeric dtype
not_usa_salary$MINsal <- as.numeric(not_usa_salary$MINsal)
not_usa_salary$MAXsal <- as.numeric(not_usa_salary$MAXsal)
#after, create factor column corresponding to original salary range columns
not_usa_salary <- not_usa_salary %>%
mutate(
salary_range = paste0 (
format(MINsal, trim = TRUE),
"-",
MAXsal
),
salary_range = fct_reorder(salary_range, MINsal)
)
#convert values to "rest of world"
not_usa_salary['Q4'] <- "Rest of World"
#combine the two dataframes by row (not by column)
usa_v_row_sal <- rbind(usa_sal_range, not_usa_salary)
usa_v_row_sal <- usa_v_row_sal %>% rename("country" = "Q4")
usa_v_row_sal
attempt to plot histogram with USA salary range vs Rest of World
ggplot(usa_v_row_sal, aes(x=salary_range, fill=country, color=country)) +
geom_bar(color="white", width=1, linewidth=0) + coord_flip() + theme_minimal() +
scale_fill_manual(values=c("grey90","skyblue")) +
labs(x="salary range, annual (USD)", y="count") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = c(0.87, 0.25))
ggsave("salary_distribution.png")
## Saving 8 x 6 in image
gender_job
gender_paygap
salrange_density <- df1 %>% select("Q3","Q4", "Q23", "Q29")
salrange_density <- salrange_density %>%
#filter(Q4 == "United States of America") %>% #filter to US salaries only
filter(Q3 == "Man" | Q3 == "Woman") %>% #confine to man/woman
filter(Q23 != "Currently not employed" & Q23 != "Other") %>% #remove unemployed/other
filter(Q29 != ">$1,000,000") %>% #remove millionaires
mutate_all(na_if, "") #fill blank data with NA
salrange_density <- salrange_density %>% filter_at(vars(Q23, Q29), all_vars(!is.na(.)))
#remove incomplete rows (left with ~7,000 rows)
salrange_density <- salrange_density %>% mutate(Q23 = replace(Q23, Q23 == "Data Analyst (Business, Marketing, Financial, Quantitative, etc)", "Data Analyst")) %>% mutate(Q23 = replace(Q23, Q23 == "Manager (Program, Project, Operations, Executive-level, etc)", "Manager"))
#simplifying strings of some job roles
#split salary range column(Q29), turn to numeric dtype, then add factor type column
salrange_density <- salrange_density %>% mutate(across(starts_with("Q29"), ~gsub("\\$", "", .))) %>% mutate(across(starts_with("Q29"), ~gsub("\\,", "", .)))
#strip out $ and commas
salrange_density <- salrange_density %>% separate(Q29, c("MINsal", "MAXsal"))
#string split
salrange_density$MINsal <- as.numeric(salrange_density$MINsal)
salrange_density$MAXsal <- as.numeric(salrange_density$MAXsal)
#as numeric conversion
salrange_density <- salrange_density %>%
mutate(
salary_range = paste0 (
format(MINsal, trim = TRUE),
"-",
MAXsal
),
salary_range = fct_reorder(salary_range, MINsal)
)
#re-combine newly separated column and create a corresponding factor column
salrange_density <- salrange_density %>% rename("gender" = "Q3", "job_title" = "Q23")
head(salrange_density)
#col_grid <- rgb(235, 235, 235, 100, maxColorValue = 300)
visualization attempt
ggplot(salrange_density, aes(x=job_title, y=salary_range)) +
geom_jitter(aes(colour=gender), height = 0.3, width = 0.3, alpha=0.8)+
scale_color_manual(values=c("deepskyblue","deeppink")) +
labs(x="job title", y="salary range, annual (USD)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust=1),
axis.text = element_text(size=rel(2)),
axis.title = element_text(size=rel(1)))
#ggsave("whiteUSsalary_density.png", width = 7000, height = 4096, units="px")
df1 %>% select(Q2) %>% gather("key", "value") %>% group_by(value) %>% summarise(n=n()) %>% arrange((value))
age_demo <- df1 %>% select(Q2) %>% gather("key", "value") %>% group_by(value) %>% summarise(n=n()) %>% arrange((value))
ggplot(age_demo, aes(x=n, y=value)) +
geom_col(fill="gold") +
scale_x_continuous(expand = c(0, 0)) +
scale_x_reverse() +
scale_y_discrete(position="right") +
geom_vline(xintercept = 0, linetype="dashed") +
theme_minimal() +
labs(x="count", y="") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
#ggsave("age_demographics.png")
age_demo
df1 %>% select(contains("Q12_")) %>% gather("key", "value") %>% mutate_all(na_if, "") %>% drop_na() %>% group_by(value) %>% summarise(n=n()) %>% arrange(desc(n))
pop_language <- df1 %>% select(contains("Q12_")) %>% gather("key", "value") %>% mutate_all(na_if, "") %>% drop_na() %>% group_by(value) %>% summarise(n=n()) %>% arrange(desc(n))
reformat keys to simplify our donut chart
pop_language <- pop_language %>% mutate(value = ifelse(n < 3000, "Other*", value)) %>% group_by(value) %>% summarise(n = sum(n)) %>% arrange(desc(n))
pop_language
pop_language$fraction <- pop_language$n / sum(pop_language$n)
pop_language$ymax <- cumsum(pop_language$fraction)
pop_language$ymin <- c(0, head(pop_language$ymax, n=-1))
# Compute label position
pop_language$labelPosition <- (pop_language$ymax + pop_language$ymin) / 2
pop_language$label <- paste0(pop_language$category, "\n value: ", pop_language$count)
## Warning: Unknown or uninitialised column: `category`.
## Warning: Unknown or uninitialised column: `count`.
making the plot
ggplot(pop_language, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=value)) +
geom_rect() +
geom_label(x=3.5, aes(y=labelPosition, label=value)) +
#geom_label( x=3.5, aes(y=labelPosition, label=label), size=6) +
scale_fill_brewer(palette= "Set2") +
coord_polar(theta="y") +
xlim(c(0, 4)) +
theme_void() +
theme(legend.position = "none") +
labs(caption="*Other includes: C#, MATLAB, Bash, PHP, Go, and Julia")
#ggsave("donutchart_lang.png")
pop_language